/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.util.List;

import mosdi.distributions.DistributionConvolver;
import mosdi.distributions.LazyDistributionConvolver;
import mosdi.fa.CDFA;
import mosdi.fa.DFAFactory;
import mosdi.fa.FiniteMemoryTextModel;
import mosdi.fa.IIDTextModel;
import mosdi.fa.MarkovianTextModel;
import mosdi.fa.ProductTextModel;
import mosdi.paa.DAA;
import mosdi.paa.PAA;
import mosdi.paa.TextBasedPAA;
import mosdi.paa.apps.IndexSumDAA;
import mosdi.paa.apps.MatchAnnotationDAA;
import mosdi.util.Alphabet;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.ProductEncoder;
import mosdi.util.SequenceUtils;


public class AnnotationStatSubcommand extends Subcommand {

	@Override
	public String usage() {
		return 
		super.usage()+" [options] <sequence.fa> <annotation.fa> <model-type> <statistic> <IUPAC-pattern>\n" +
		"\n" +
		"From <sequence.fa> and <annotation.fa>, a joint model for sequence and\n" +
		"annotation is estimated (that's the only purpose of these files).\n" +
		"Then, the distribution of the annotation sum at match position of\n" +
		"<IUPAC-pattern> is computed (assuming that the pattern occurs\n" +
		"<occurrences> times).\n" +
		"\n" +
		"<model-type>:\n" +
		"  \"joint\":       text and annotation model are estimated jointly\n" +
		"                 (requires more sequence/annotation data for robust\n" +
		"                 estimation)\n" +
		"  \"independent\": estimate models indepentently and create joint model\n" +
		"                 by using a product construction. Assumption: annotation\n" +
		"                 is independent of sequences\n" +
		"\n" +
		"<statistic>:\n" +
		"  \"exact\":        (slowest) construct PAA and compute exact distribution\n" +
		"  \"comp-poisson\": (faster) use compound Poisson model (faster)\n" +
		"  \"indep-occ\":    (fastest) assume independent occurrences\n" +
		"\n" +
		"Options:\n" +
		"  -O <model-order>: Order of annotation model (default: 1)\n" +
		"  -M <model-order>: Order of text model (default: 1)\n" +
		"  -p <pseudo-counts>: Pseudocounts for estimation of annotation model (default: 0.1)\n" +
		"  -b <bincount>: Number of bins for discretization of annotation scores (default: 11)\n" +
		"  -r: consider reverse complement\n" +
		"  -m <max-annotation>: maximum annotation sum of interest (default: 100)\n" +
		"  -n <occurrences>: number of occurrences (default: 10)\n" +
		"  -t <textlength>: (default: 100)";
	}
	
	@Override
	public String description() {
		return "Computes (exact or approximate) distribution of annotation sums at match positions.";
	}

	@Override
	public String name() {
		return "annotation-sum-dist";
	}
	
	@Override
	public int run(String[] args) {
		parseOptions(args, 5, "O:M:p:b:rm:n:t:");

		// Option dependencies
		// -- none --

		// Mandatory arguments
		String sequenceFilename = getStringArgument(0);
		String annotationFilename = getStringArgument(1);
		String modelName = getStringArgument(2);
		String statisticName = getStringArgument(3);
		String pattern = getStringArgument(4);

		// Options
		final Alphabet dnaAlphabet = Alphabet.getDnaAlphabet();
		double pseudoCounts = getDoubleOption("p", 0.1);
		int annotationModelOrder = getNonNegativeIntOption("O",1);
		int textModelOrder = getNonNegativeIntOption("M",1);
		int binCount = getPositiveIntOption("b", 11);
		boolean considerReverse = getBooleanOption("r", false);
		int maxAnnotationSum = getPositiveIntOption("m", 100); 
		int occurrences = getNonNegativeIntOption("n", 10);
		int textLength = getPositiveIntOption("t", 100);
		
		if (!modelName.equals("joint") && !modelName.equals("independent")) {
			Log.errorln("Invalid <model-type>: \""+modelName+"\".");
			System.exit(1);
		}
		if (!statisticName.equals("exact") && !statisticName.equals("comp-poisson") && !statisticName.equals("indep-occ")) {
			Log.errorln("Invalid <statistic>: \""+statisticName+"\".");
			System.exit(1);
		}
		
		Log.startTimer();
		List<NamedSequence> namedSequences = null;
		try {
			namedSequences = SequenceUtils.readFastaFile(sequenceFilename, dnaAlphabet, true);
		} catch (Exception e) {
			Log.errorln(e.toString());
			System.exit(1);
		}
		Log.restartTimer("Reading FASTA file");
		SequenceUtils.readAnnotationTrack("annotation0", annotationFilename, namedSequences, binCount);
		Log.restartTimer("Reading annotation track");
		
		// construct joint model
		FiniteMemoryTextModel jointTextModel = null;
		FiniteMemoryTextModel annotationModel = null;
		ProductEncoder productAlphabet = new ProductEncoder(dnaAlphabet.size(),binCount);
		if (modelName.equals("joint")) {
			if (statisticName.equals("indep-occ")) {
				Log.errorln("Statistic \"indep-occ\" cannot be combined with model type \"joint\"");
				System.exit(1);
			}
			if (annotationModelOrder!=textModelOrder) {
				Log.errorln("For model type \"joint\", order of text and annotation model must be equal (options -M and -O).");
				System.exit(1);
			}
			int[] jointSequence = SequenceUtils.concatAndJoin(namedSequences, productAlphabet, "annotation0", false);
			jointTextModel = (annotationModelOrder==0)?new IIDTextModel(productAlphabet.getValueCount(), jointSequence, pseudoCounts):new MarkovianTextModel(annotationModelOrder,productAlphabet.getValueCount(),jointSequence,pseudoCounts);
		}
		if (modelName.equals("independent")) {
			int[] sequences = SequenceUtils.concatSequences(namedSequences, considerReverse);
			int[] annotations = SequenceUtils.concatAnnotations(namedSequences, "annotation0", considerReverse);
			FiniteMemoryTextModel textModel = (annotationModelOrder==0)?new IIDTextModel(dnaAlphabet.size(), sequences, pseudoCounts):new MarkovianTextModel(annotationModelOrder,dnaAlphabet.size(),sequences,pseudoCounts);
			annotationModel = (annotationModelOrder==0)?new IIDTextModel(binCount, annotations, pseudoCounts):new MarkovianTextModel(annotationModelOrder,binCount,annotations,pseudoCounts);
			jointTextModel = new ProductTextModel(textModel,annotationModel);
		}
		Log.restartTimer("Estimating text/annotation model");
		CDFA cdfa = DFAFactory.buildFromIupacPattern(pattern, considerReverse);
		Log.stopTimer("Building CDFA");
		if (statisticName.equals("exact")) {
			Log.startTimer();
			MatchAnnotationDAA daa = new MatchAnnotationDAA(binCount, cdfa, pattern.length(), occurrences+1, maxAnnotationSum);	
			TextBasedPAA paa = new TextBasedPAA(daa, jointTextModel);
			Log.printf(Log.Level.VERBOSE, "PAA states: %d\n", paa.getStateCount());
			Log.printf(Log.Level.VERBOSE, "PAA values: %d\n", paa.getValueCount());
			Log.restartTimer("Constructing PAA.");
			double[] valueDistribution = paa.computeValueDistribution(textLength);
			Log.stopTimer("Computing state-value distribution of PAA.");
			Log.printf(Log.Level.STANDARD, "Exact distribution of P(A | O=%d), where A is the r.v giving the annotation\n",occurrences);
			Log.printf(Log.Level.STANDARD, "sum at positions of matches and O gives the number of occurrences:\n");
			StringBuffer sb = new StringBuffer();
			sb.append(">>");
			double sum = 0.0;
			for (int i=0; i<=maxAnnotationSum; ++i) {
				sum += valueDistribution[daa.getValue(occurrences, i)];
			}
			for (int i=0; i<=maxAnnotationSum; ++i) {
				sb.append(" "+(valueDistribution[daa.getValue(occurrences, i)]/sum));
			}
			Log.println(Log.Level.STANDARD, sb.toString());
		}
		if (statisticName.equals("comp-poisson")){
			Log.println(Log.Level.STANDARD,"Not yet implemented!");
			System.exit(1);
			// ClumpAnnotationSumCalculator casc = new ClumpAnnotationSumCalculator(jointTextModel, productAlphabet, cdfa, pattern.length());
			// double[] clumpSumDistribution = casc.annotationSumDistribution(maxAnnotationSum, 1e-300);
			// TODO TODO TODO
		}
		if (statisticName.equals("indep-occ")){
			DAA daa = new IndexSumDAA(annotationModel.getAlphabetSize(), pattern.length()*(binCount-1));
			PAA paa = new TextBasedPAA(daa, annotationModel);
			double[] scoreDistribution = paa.computeValueDistribution(pattern.length());
			DistributionConvolver dc =  new LazyDistributionConvolver(scoreDistribution);
			double[] convolvedDistribution = dc.getDistribution(occurrences);
			Log.printf(Log.Level.STANDARD, "Approximate distribution of P(A | O=%d), where A is the r.v giving the annotation\n",occurrences);
			Log.printf(Log.Level.STANDARD, "sum at positions of matches and O gives the number of occurrences:\n");
			StringBuffer sb = new StringBuffer();
			sb.append(">>");
			for (int i=0; i<convolvedDistribution.length; ++i) {
				sb.append(" "+convolvedDistribution[i]);
			}
			Log.println(Log.Level.STANDARD,sb.toString());
		}
		return 0;
	}

}
